Velocyto (implemented in R)

Chromaffin differentiation analysis

元URL: http://pklab.med.harvard.edu/velocyto/notebooks/R/chromaffin2.nb.html

library(velocyto.R)
## Loading required package: Matrix

データ読み込み

# 発現量データ
ldat <- readRDS(url("http://pklab.med.harvard.edu/velocyto/chromaffin/ldat.rds"))
# 細胞種アノテーション
cell.colors <- readRDS(url("http://pklab.med.harvard.edu/velocyto/chromaffin/cell.colors.rds"))
# 次元削減後の数値
emb <- readRDS(url("http://pklab.med.harvard.edu/velocyto/chromaffin/embedding.rds"))
# str関数でオブジェクト ldat の内容を情報付きで表示する
str(ldat)
## List of 4
##  $ spliced  :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. ..@ i       : int [1:2724270] 3 9 10 13 14 17 19 20 28 29 ...
##   .. ..@ p       : int [1:385] 0 7100 14272 21376 28491 36017 43375 50956 57637 64998 ...
##   .. ..@ Dim     : int [1:2] 23420 384
##   .. ..@ Dimnames:List of 2
##   .. .. ..$ : chr [1:23420] "Xkr4" "Rp1" "Sox17" "Mrpl15" ...
##   .. .. ..$ : chr [1:384] "onefilepercell_A1_unique_and_others_J2CH1:A1_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A10_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A11_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A12_unique.bam" ...
##   .. ..@ x       : num [1:2724270] 24 3 22 136 48 21 15 32 109 13 ...
##   .. ..@ factors : list()
##  $ unspliced:Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. ..@ i       : int [1:1528882] 0 4 10 13 14 17 19 20 23 29 ...
##   .. ..@ p       : int [1:385] 0 3911 8043 12057 16176 20795 25046 29389 33438 37382 ...
##   .. ..@ Dim     : int [1:2] 23420 384
##   .. ..@ Dimnames:List of 2
##   .. .. ..$ : chr [1:23420] "Xkr4" "Rp1" "Sox17" "Mrpl15" ...
##   .. .. ..$ : chr [1:384] "onefilepercell_A1_unique_and_others_J2CH1:A1_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A10_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A11_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A12_unique.bam" ...
##   .. ..@ x       : num [1:1528882] 5 23 4 6 8 2 70 14 1 15 ...
##   .. ..@ factors : list()
##  $ ambiguous:Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. ..@ i       : int [1:198206] 3 19 30 121 144 145 188 295 366 387 ...
##   .. ..@ p       : int [1:385] 0 499 993 1562 2070 2639 3184 3785 4321 4821 ...
##   .. ..@ Dim     : int [1:2] 23420 384
##   .. ..@ Dimnames:List of 2
##   .. .. ..$ : chr [1:23420] "Xkr4" "Rp1" "Sox17" "Mrpl15" ...
##   .. .. ..$ : chr [1:384] "onefilepercell_A1_unique_and_others_J2CH1:A1_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A10_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A11_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A12_unique.bam" ...
##   .. ..@ x       : num [1:198206] 17 1 14 24 9 1 5 23 1 2 ...
##   .. ..@ factors : list()
##  $ spanning :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. ..@ i       : int [1:443099] 3 17 40 80 111 132 146 161 200 216 ...
##   .. ..@ p       : int [1:385] 0 1089 2250 3556 4768 6201 7494 8878 10237 11349 ...
##   .. ..@ Dim     : int [1:2] 23420 384
##   .. ..@ Dimnames:List of 2
##   .. .. ..$ : chr [1:23420] "Xkr4" "Rp1" "Sox17" "Mrpl15" ...
##   .. .. ..$ : chr [1:384] "onefilepercell_A1_unique_and_others_J2CH1:A1_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A10_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A11_unique.bam" "onefilepercell_A1_unique_and_others_J2CH1:A12_unique.bam" ...
##   .. ..@ x       : num [1:443099] 1 2 1 3 2 1 1 1 1 1 ...
##   .. ..@ factors : list()
# 細胞名から余分な文字列を削除
ldat <- lapply(ldat,function(x) {
  colnames(x) <-  gsub("_unique.bam","",gsub(".*:","",colnames(x)))
  x
})

Gene filtering

#Spliced expression magnitude distribution across genes:
hist(log10(rowSums(ldat$spliced) +1),
     col = 'wheat',
     xlab = 'log10[ number of reads + 1]',
     main = 'number of reads per gene'
    )

発現量行列から平均発現量が g (pre-defined) を超えない遺伝子を除去

# exonic read (spliced) expression matrix
emat <- ldat$spliced;

# intronic read (unspliced) expression matrix
nmat <- ldat$unspliced

# spanning read (intron+exon) expression matrix
smat <- ldat$spanning;

# filter expression matrices based on some minimum max-cluster averages
emat <- filter.genes.by.cluster.expression(emat, cell.colors,min.max.cluster.average = 5)
nmat <- filter.genes.by.cluster.expression(nmat, cell.colors,min.max.cluster.average = 1)
smat <- filter.genes.by.cluster.expression(smat, cell.colors,min.max.cluster.average = 0.5)
# look at the resulting gene set
length(intersect(rownames(emat), rownames(nmat)))
## [1] 8548
# and if we use spanning reads (smat)
length(intersect( intersect(rownames(emat), rownames(nmat)), rownames(smat)) )
## [1] 1696

RNA velocityの推定

Several variants of velocity estimates using gene-relative model

細胞の kNN poolingと 極端なクォンタイルに基づくガンマフィットを組み合わせた頑健な推定手法を用いる。

min/max quantile fitを使うと、この場合、遺伝子特異的なオフセットはspanning read (smat)のフィットを必要としません。 ここでは、(spliced f expressionレベルで)上位/下位 5%に基づいてフィットを行います。

fit.quantile <- 0.05;
rvel.qf <- gene.relative.velocity.estimates(emat,
                                            nmat,
                                            deltaT=1,
                                            kCells = 5,
                                            fit.quantile = fit.quantile
                                           )
## calculating cell knn ... done
## calculating convolved matrices ... done
## fitting gamma coefficients ... done. succesfful fit for 8548 genes
## filtered out 1306 out of 8548 genes due to low nmat-emat correlation
## filtered out 754 out of 7242 genes due to low nmat-emat slope
## calculating RNA velocity shift ... done
## calculating extrapolated cell state ... done
# 可視化(第5主成分までを用いたプロット)
options(repr.plot.width = 10, repr.plot.height = 10)
pca.velocity.plot(rvel.qf,
                  nPcs = 5,
                  plot.cols = 2,
                  cell.colors = ac(cell.colors, alpha=0.7),
                  cex = 1.2,
                  pcount = 0.1,
                  pc.multipliers = c(1,-1,-1,-1,-1)
                 )
## log ... pca ... pc multipliers ... delta norm ... done

## done
# show.gene オプションを使うと個々の遺伝子のフィットを可視化できる
options(repr.plot.width = 12, repr.plot.height = 4)

# ここでは時間を節約するために、事前に計算した速度(rvel.qf)を渡す
# define custom pallet for expression magnitude
gene.relative.velocity.estimates(emat, nmat,
                                 kCells = 5,
                                 fit.quantile = fit.quantile,
                                 old.fit = rvel.qf,
                                 show.gene = 'Chga',
                                 cell.emb = emb,
                                 cell.colors = cell.colors)
## calculating convolved matrices ... done

## [1] 1
  • 別のやり方として、spanning reads (smat) を遺伝子オフセットのフィットに用いることもできます。

  • これはより正確なオフセット推定値が得られますが、spanning reads(スプライスサイトにまたがるリード)はまれなため、より少ない遺伝子についての推定値となります。

  • さらに、ここではdiagona.quantiles = TRUEを用いて、スプライスされたシグナルとスプライスされていないシグナルの(正規化された)和の値を用いて極端なquantilesを推定していることに注意してください。

rvel <- gene.relative.velocity.estimates(emat,
                                         nmat,
                                         smat = smat,
                                         kCells = 5,
                                         fit.quantile = fit.quantile,
                                         diagonal.quantiles = TRUE)
## calculating cell knn ... done
## calculating convolved matrices ... done
## fitting smat-based offsets ... done
## fitting gamma coefficients ... done. succesfful fit for 1696 genes
## filtered out 26 out of 1696 genes due to low nmat-smat correlation
## filtered out 138 out of 1670 genes due to low nmat-emat correlation
## filtered out 14 out of 1532 genes due to low nmat-emat slope
## calculating RNA velocity shift ... done
## calculating extrapolated cell state ... done
# 得られた推定値を可視化
options(repr.plot.width = 10, repr.plot.height = 10)
pca.velocity.plot(rvel,
                  nPcs = 5,
                  plot.cols = 2,
                  cell.colors = ac(cell.colors,alpha=0.7),
                  cex = 1.2,
                  pcount = 0.1,
                  pc.multipliers = c(1,-1,1,1,1))
## log ... pca ... pc multipliers ... delta norm ... done

## done

以下ではkNN スムージングを用いずに relative gamma fit を用いて最も基本的なバージョンのvelocityを推定します。

rvel1 <- gene.relative.velocity.estimates(emat,
                                          nmat,
                                          deltaT = 1,
                                          deltaT2 = 1,
                                          kCells = 1,
                                          fit.quantile = fit.quantile)
## fitting gamma coefficients ... done. succesfful fit for 8548 genes
## filtered out 783 out of 8548 genes due to low nmat-emat correlation
## filtered out 1330 out of 7765 genes due to low nmat-emat slope
## calculating RNA velocity shift ... done
## calculating extrapolated cell state ... done
# 得られた推定値を可視化
options(repr.plot.width = 10, repr.plot.height = 10)
pca.velocity.plot(rvel1,
                  nPcs = 5,
                  plot.cols = 2,
                  cell.colors = ac(cell.colors,alpha=0.7),
                  cex = 1.2,
                  pcount = 0.1,
                  pc.multipliers = c(1,-1,1,1,1))
## log ... pca ... pc multipliers ... delta norm ... done

## done

tSNE上での可視化

PCA空間の代わりに、既に得られている空間上で可視化します。以下ではtSNEを用います。

vel <- rvel
arrow.scale = 3
cell.alpha = 0.4
cell.cex = 1
fig.height = 4
fig.width = 4.5

options(repr.plot.width = 6, repr.plot.height = 6)
show.velocity.on.embedding.cor(emb,
                               vel,
                               n = 100,
                               scale = 'sqrt',
                               cell.colors = ac(cell.colors,alpha=cell.alpha),
                               cex = cell.cex,
                               arrow.scale = arrow.scale,
                               arrow.lwd = 1
                              )

## delta projections ... sqrt knn ... transition probs ... done
## calculating arrows ... done

あるいは、同じ関数を使って速度ベクトル場 (velocity vector field) を計算することもできます。

show.velocity.on.embedding.cor(emb,
                               vel,
                               n = 100,
                               scale = 'sqrt',
                               cell.colors = ac(cell.colors,alpha = cell.alpha),
                               cex = cell.cex,
                               arrow.scale = arrow.scale,
                               show.grid.flow = TRUE,
                               min.grid.cell.mass = 0.5,
                               grid.n = 20,
                               arrow.lwd = 2
                              )

## delta projections ... sqrt knn ... transition probs ... done
## calculating arrows ... done
## grid estimates ... grid.sd= 1.731696  min.arrow.size= 0.03463392  max.grid.arrow.length= 0.09156871  done

Velocity estimate based on gene structure

  • 遺伝子構造パラメータに基づいてvelocityを推定するためには、遺伝子(gtf)ファイルと、exon毎のマッピング情報を含むデータが必要となる(後者は velocyto.py のデバッグ用オプション -d でhdf5出力される)。

  • ここでは正確なgtfファイルを取得する。まずはゲノム(UCSC mm10)内で予測される内部プライミングサイトの情報をコンパイルします。

# 遺伝子データダウンロード
#ip.mm10 <- readRDS(url("http://pklab.med.harvard.edu/velocyto/chromaffin/ip.mm10.rds"))
system("wget http://pklab.med.harvard.edu/velocyto/chromaffin/ip.mm10.rds")
ip.mm10 <- readRDS("ip.mm10.rds")
# ゲノムデータダウンロード
#gene.info <- readRDS(url("http://pklab.med.harvard.edu/velocyto/chromaffin/gene.info.rds"))

system("wget http://pklab.med.harvard.edu/velocyto/chromaffin/gene.info.rds")
gene.info <- readRDS("gene.info.rds")

Genome-wide model fit

# start with unfiltered matrices, as we can use more genes in these types of estimates
emat <- ldat$spliced
nmat <- ldat$unspliced
smat <- ldat$spanning
emat <- filter.genes.by.cluster.expression(emat, cell.colors, min.max.cluster.average = 7)
gvel <- global.velcoity.estimates(emat,
                                  nmat,
                                  rvel,
                                  base.df = gene.info$gene.df,
                                  smat = smat,
                                  deltaT = 1,
                                  kCells = 5,
                                  kGenes = 15,
                                  kGenes.trim = 5,
                                  min.gene.cells = 0,
                                  min.gene.conuts = 500)
## filtered out 4 out of 8990 genes due to low emat levels
## filtered out 1072 out of 8986 genes due to insufficient exonic or intronic lengths
## filtered out 204 out of 7914 genes due to excessive nascent counts
## using relative slopes for 1221 genes to fit structure-based model ... with internal priming info ... 80.5% deviance explained.
## predicting gamma ... done
## refitting offsets ... calculating cell knn ... done
## calculating convolved matrices ... done
## fitting smat-based offsets ... done
## fitting gamma coefficients ... done. succesfful fit for 7694 genes
## filtered out 1337 out of 7551 genes due to low nmat-smat correlation
## filtered out 899 out of 6214 genes due to low nmat-emat correlation
## filtered out 440 out of 5315 genes due to low nmat-emat slope
## calculating RNA velocity shift ... done
## calculating extrapolated cell state ... done
## re-estimated offsets for 6214 out of 7710 genes
## calculating convolved matrices ... done
## calculating gene knn ... done
## estimating M values ... adjusting mval offsets ... re-estimating gamma ... done
## calculating RNA velocity shift ... done
## calculating extrapolated cell state ... done
pca.velocity.plot(gvel,
                  nPcs = 5,
                  plot.cols = 2,
                  cell.colors = ac(cell.colors,alpha=0.7),
                  cex = 1.2,
                  pcount = 0.1,
                  pc.multipliers = c(1,-1,-1,1,1)
                 )
## log ... pca ... pc multipliers ... delta norm ... done

## done
par(mfrow=c(1,2),
    mar = c(2.5,2.5,2.5,1.5),
    mgp = c(2,0.65,0),
    cex = 0.85)
arrow.scale=3
cell.alpha=0.4
cell.cex=1
fig.height=4
fig.width=4.5

#pdf(file='tsne.rvel_gvel.plots.pdf',height=6,width=12)
show.velocity.on.embedding.cor(emb,
                               rvel,
                               n=100,
                               scale='sqrt',
                               cell.colors=ac(cell.colors,alpha=cell.alpha),
                               cex=cell.cex,
                               arrow.scale=arrow.scale,
                               arrow.lwd=1,
                               main='gene-relative esitmate',
                               do.par=F)
## delta projections ... sqrt knn ... transition probs ... done
## calculating arrows ... done
show.velocity.on.embedding.cor(emb,
                               gvel,
                               n=100,
                               scale='sqrt',
                               cell.colors=ac(cell.colors,alpha=cell.alpha),
                               cex=cell.cex,
                               arrow.scale=arrow.scale,
                               arrow.lwd=1,
                               main='gene-structure estimate',
                               do.par=F)

## delta projections ... sqrt knn ... transition probs ... done
## calculating arrows ... done
par(mfrow=c(1,2), mar = c(2.5,2.5,2.5,1.5), mgp = c(2,0.65,0), cex = 0.85);
x <- tSNE.velocity.plot(rvel,
                        nPcs=15,
                        cell.colors=cell.colors,
                        cex=0.9,
                        perplexity=200,
                        norm.nPcs=NA,
                        pcount=0.1,
                        scale='log',
                        do.par=F,
                        main='gene-relative estimate')
## rescaling ... log ... pca ... delta norm ... tSNE ...Performing PCA
## Read the 768 x 15 data matrix successfully!
## OpenMP is working. 144 threads.
## Using no_dims = 2, perplexity = 200.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 1.48 seconds (sparsity = 0.903880)!
## Learning embedding...
## Iteration 50: error is 45.227904 (50 iterations in 49.38 seconds)
## Iteration 100: error is 45.224233 (50 iterations in 49.24 seconds)
## Iteration 150: error is 44.400020 (50 iterations in 49.58 seconds)
## Iteration 200: error is 44.371143 (50 iterations in 49.91 seconds)
## Iteration 250: error is 44.370880 (50 iterations in 49.76 seconds)
## Iteration 300: error is 0.166686 (50 iterations in 49.99 seconds)
## Iteration 350: error is 0.158923 (50 iterations in 49.83 seconds)
## Iteration 400: error is 0.157552 (50 iterations in 46.73 seconds)
## Iteration 450: error is 0.156934 (50 iterations in 49.82 seconds)
## Iteration 500: error is 0.157107 (50 iterations in 50.19 seconds)
## Iteration 550: error is 0.156934 (50 iterations in 47.51 seconds)
## Iteration 600: error is 0.157833 (50 iterations in 49.82 seconds)
## Iteration 650: error is 0.157596 (50 iterations in 48.49 seconds)
## Iteration 700: error is 0.157438 (50 iterations in 49.69 seconds)
## Iteration 750: error is 0.157395 (50 iterations in 49.32 seconds)
## Iteration 800: error is 0.157365 (50 iterations in 48.73 seconds)
## Iteration 850: error is 0.157345 (50 iterations in 49.67 seconds)
## Iteration 900: error is 0.157384 (50 iterations in 49.86 seconds)
## Iteration 950: error is 0.157615 (50 iterations in 48.42 seconds)
## Iteration 1000: error is 0.157306 (50 iterations in 48.90 seconds)
## Fitting performed in 984.84 seconds.
## delta norm ... done
x <- tSNE.velocity.plot(gvel,
                        nPcs=15,
                        cell.colors=cell.colors,
                        cex=0.9,
                        perplexity=200,
                        norm.nPcs=NA,
                        pcount=0.1,
                        scale='log',
                        do.par=F,
                        main='gene-structure estimate')
## rescaling ... log ... pca ... delta norm ... tSNE ...Performing PCA
## Read the 768 x 15 data matrix successfully!
## OpenMP is working. 144 threads.
## Using no_dims = 2, perplexity = 200.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 1.65 seconds (sparsity = 0.905877)!
## Learning embedding...
## Iteration 50: error is 45.043481 (50 iterations in 48.21 seconds)
## Iteration 100: error is 45.043481 (50 iterations in 49.78 seconds)
## Iteration 150: error is 45.043481 (50 iterations in 49.97 seconds)
## Iteration 200: error is 45.043481 (50 iterations in 49.51 seconds)
## Iteration 250: error is 45.043481 (50 iterations in 49.16 seconds)
## Iteration 300: error is 0.162788 (50 iterations in 50.85 seconds)
## Iteration 350: error is 0.154705 (50 iterations in 49.93 seconds)
## Iteration 400: error is 0.155252 (50 iterations in 49.04 seconds)
## Iteration 450: error is 0.155771 (50 iterations in 48.26 seconds)
## Iteration 500: error is 0.156125 (50 iterations in 50.32 seconds)
## Iteration 550: error is 0.155698 (50 iterations in 50.35 seconds)
## Iteration 600: error is 0.155528 (50 iterations in 50.19 seconds)
## Iteration 650: error is 0.155615 (50 iterations in 50.17 seconds)
## Iteration 700: error is 0.155632 (50 iterations in 50.18 seconds)
## Iteration 750: error is 0.155846 (50 iterations in 48.55 seconds)
## Iteration 800: error is 0.155829 (50 iterations in 47.32 seconds)
## Iteration 850: error is 0.155994 (50 iterations in 47.90 seconds)
## Iteration 900: error is 0.156227 (50 iterations in 50.31 seconds)
## Iteration 950: error is 0.156040 (50 iterations in 49.59 seconds)
## Iteration 1000: error is 0.156077 (50 iterations in 48.87 seconds)
## Fitting performed in 988.46 seconds.
## delta norm ... done

# Cell trajectory modeling
# 注!計算量が大きいです
x <- show.velocity.on.embedding.eu(emb,
                                   rvel,
                                   n = 40,
                                   scale = 'sqrt',
                                   cell.colors = ac(cell.colors,alpha=cell.alpha),
                                   cex = cell.cex,
                                   nPcs = 30,
                                   sigma = 2.5,
                                   show.trajectories = TRUE,
                                   diffusion.steps = 400,
                                   n.trajectory.clusters = 15,
                                   ntop.trajectories = 1,
                                   embedding.knn = T,
                                   control.for.neighborhood.density = TRUE,
                                   n.cores = 4
                                  )

## sqrt scale ... reducing to 30 PCs ... distance ... sigma= 2.5  beta= 1  transition probs ... embedding kNN ... done
## simulating diffusion ... constructing path graph ... tracing shortest trajectories ... clustering ... done.